This notebook and analysis packages are based on R version
4.3+.
The installation of packages has not been automated due to multiple
dependencies from CRAN and Bioconductor.
It is suggested the packages be installed with supervision to fulfil all
dependencies.
Additionally, the celldex package on first-use will prompt
for a cache directory.
NOTE: Running this entire notebook can take 30 to 40 minutes to complete, i.e., Section 5 is most time consuming.
#Bioconductor packages
library(SingleCellExperiment)
library(celldex)
library(SingleR)
# CRAN packages
library(stringr)
library(Seurat)
library(SeuratObject)
If running on macOS or Windows, the following OS functions are
needed: wget and tar
All the data will be downloaded in the current working directory.
system('wget -q https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96583/suppl/GSE96583_RAW.tar')
system('wget -q https://ftp.ncbi.nlm.nih.gov/geo/series/GSE96nnn/GSE96583/suppl/GSE96583_batch2.genes.tsv.gz')
system('wget -q https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye1.ctrl.8.10.sm.best')
system('wget -q https://raw.githubusercontent.com/yelabucsf/demuxlet_paper_code/master/fig3/ye2.stim.8.10.sm.best')
system('tar -xf GSE96583_RAW.tar')
system('rm GSE96583_RAW.tar GSM2560245* GSM2560246* GSM2560247*')
# Load data from the extracted sparse matrix files
ctrl_cells = CreateSeuratObject(
ReadMtx(
mtx = 'GSM2560248_2.1.mtx.gz', features = 'GSE96583_batch2.genes.tsv.gz',
cells = 'GSM2560248_barcodes.tsv.gz', strip.suffix = T
),
project = 'Ctrl'
)
stim_cells = CreateSeuratObject(
ReadMtx(
mtx = 'GSM2560249_2.2.mtx.gz', features = 'GSE96583_batch2.genes.tsv.gz',
cells = 'GSM2560249_barcodes.tsv.gz', strip.suffix = T
),
project = 'Stim'
)
# Load output of demuxlet tool with droplet annotation
ctrl_dmx = read.table('./ye1.ctrl.8.10.sm.best', sep = '\t', header = T)
stim_dmx = read.table('./ye2.stim.8.10.sm.best', sep = '\t', header = T)
# Select only singlets and filter out doublets and ambiguous droplets
ctrl_keep = ctrl_dmx[str_detect(ctrl_dmx$BEST, 'SNG'), ]
stim_keep = stim_dmx[str_detect(stim_dmx$BEST, 'SNG'), ]
# Check filtered ratio
nrow(ctrl_keep)/nrow(ctrl_dmx)
## [1] 0.8913058
nrow(stim_keep)/nrow(stim_dmx)
## [1] 0.8868891
The proportion of doublets (~0.11) matches what is noted in the article.
ctrl_fltr = ctrl_cells[ , colnames(ctrl_cells) %in% str_remove(ctrl_keep$BARCODE, '-1')]
ctrl_fltr
## An object of class Seurat
## 35635 features across 13030 samples within 1 assay
## Active assay: RNA (35635 features, 0 variable features)
stim_fltr = stim_cells[ , colnames(stim_cells) %in% str_remove(stim_keep$BARCODE, '-1')]
stim_fltr
## An object of class Seurat
## 35635 features across 12812 samples within 1 assay
## Active assay: RNA (35635 features, 0 variable features)
# Add Sample ID
ctrl_fltr@meta.data$sample = ctrl_keep$SNG.1ST
stim_fltr@meta.data$sample = stim_keep$SNG.1ST
Beginning with a combined matrix for downstream analysis.
# Add experiment prefix to Cell IDs while merging
com_cells = merge(x = ctrl_fltr, y = stim_fltr, add.cell.ids = c('Ctrl-', 'Stim-'))
# Perform typical QC for single-cell RNA-seq
# Calculate specific metrics since it is PBMC data
# Mitochondrial, Ribosomal and Haemoglobin genes
com_cells = PercentageFeatureSet(com_cells, '^MT-', col.name = 'percent.mt')
com_cells = PercentageFeatureSet(com_cells, '^RP[SL]', col.name = 'percent.rb')
com_cells = PercentageFeatureSet(com_cells, '^HB[^(P)]', col.name = 'percent.hb')
# Pre-filtering QC metrics
VlnPlot(com_cells, group.by = 'sample', pt.size = 0,
features = c('nFeature_RNA', 'nCount_RNA', 'percent.mt', 'percent.rb', 'percent.hb'), ncol = 3)
It is observed that the mitochondrial genes have no expression.
# Verifying MT genes
mito_genes = sort(rownames(com_cells)[grep('^MT-', rownames(com_cells))])
mito_genes
## [1] "MT-ATP6" "MT-ATP8" "MT-CO1" "MT-CO2" "MT-CO3" "MT-CYB" "MT-ND1"
## [8] "MT-ND2" "MT-ND3" "MT-ND4" "MT-ND4L" "MT-ND5" "MT-ND6"
The mitochondrial gene names are as expected and all 13 exist, but
the counts are indeed 0.
Rectifying this would require starting from FASTQ files and
quantification to obtain mitochondrial gene expression.
Since this is a peer-reviewed study and authors have accounted for
single-cell sequencing challenges continuing with typical QC.
dim(com_cells)
## [1] 35635 25842
table(com_cells$orig.ident)
##
## Ctrl Stim
## 13030 12812
table(com_cells$sample)
##
## 101 107 1015 1016 1039 1244 1256 1488
## 2101 1096 5235 3611 1155 3564 4253 4827
# Keep cells that have genes quantified between 200 and 2500
# And have ribosomal genes % > 5 and haemoglobin genes % < 2.5
sel_cells = WhichCells(com_cells, expression = nFeature_RNA > 200 & nFeature_RNA < 2500 &
percent.rb > 5 & percent.hb < 2.5)
# Keep genes that are expressed in at least 5 cells
sel_genes = rownames(com_cells)[rowSums(com_cells) > 5]
com_cells = subset(com_cells, cells = sel_cells, features = sel_genes)
dim(com_cells)
## [1] 14460 24362
table(com_cells$orig.ident)
##
## Ctrl Stim
## 12828 11534
table(com_cells$sample)
##
## 101 107 1015 1016 1039 1244 1256 1488
## 1966 969 4878 3558 945 3420 4017 4609
# Post-filtering QC metrics
VlnPlot(com_cells, group.by = 'sample', pt.size = 0,
features = c('nFeature_RNA', 'nCount_RNA', 'percent.rb', 'percent.hb'), ncol = 2)
~94% of combined cell dataset still remains.
Noise/heterogeneity introduced due to many cycling cells can cause issues for further analysis.
com_cells = NormalizeData(com_cells, verbose = F)
com_cells = CellCycleScoring(com_cells, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)
VlnPlot(com_cells, group.by = 'sample', pt.size = 0, features = c('S.Score', 'G2M.Score'))
Cycling cells make up a very small population and do not need filtering.
Perform PCA, embedding and cluster identification with mostly default
parameters.
Tuning certain parameters iteratively can provide better embedding and
clustering (which is not performed below).
# Find top 2000 variable genes
com_cells = FindVariableFeatures(com_cells, verbose = F)
# Scale and center top features
com_cells = ScaleData(com_cells, verbose = F)
# Perform PCA with 30 components and then UMAP
com_cells = RunPCA(com_cells, npcs = 30, verbose = F)
com_cells = RunUMAP(com_cells, reduction = 'pca', dims = 1:30, verbose = F)
# Construct a nearest neighbors graph for graph clustering
com_cells = FindNeighbors(com_cells, reduction = 'pca', dims = 1:30, verbose = F)
com_cells = FindClusters(com_cells, resolution = 0.5, verbose = F)
# Plot 2D embedding with Ctrl and Stim
UMAPPlot(com_cells, group.by = 'orig.ident') + ggplot2::ggtitle(label = '')
# Plot 2D embedding with identified clusters
UMAPPlot(com_cells, label = T) + NoLegend()
The primary separation of cell groups is by treatment as seen in Fig.
3a of the article.
Additionally, the clustering is occuring within Ctrl and Stim groups.
This makes it challenging to identify cell type gene markers.
Thus, it would be better to harmonize the Control
and Stimulated cells.
This would help to:
1. Obtain low-dimensional representation and clustering without
treatment effects
2. Find gene markers across clusters irrespective of IFN-β
stimulation
3. Find genes that are up/down regulated by stimulation across
clusters
This relies on Seurat’s anchoring capabilities.
# Using the previously filtered cell matrices and split by sample
ctrl_fltr = SplitObject(ctrl_fltr, split.by = 'sample')
stim_fltr = SplitObject(stim_fltr, split.by = 'sample')
# Normalization and top features for each sample
ctrl_fltr = lapply(X = ctrl_fltr, FUN = function(x) {
x = NormalizeData(x, verbose = F)
x = FindVariableFeatures(x, verbose = F)
})
stim_fltr = lapply(X = stim_fltr, FUN = function(x) {
x = NormalizeData(x, verbose = F)
x = FindVariableFeatures(x, verbose = F)
})
WARNING
The below code chunk can take 20 to 30 minutes to complete.
anchors = FindIntegrationAnchors(c(ctrl_fltr, stim_fltr), verbose = F)
int_cells = IntegrateData(anchors, verbose = F)
int_cells
## An object of class Seurat
## 37635 features across 25842 samples within 2 assays
## Active assay: integrated (2000 features, 2000 variable features)
## 1 other assay present: RNA
# QC for integrated data
DefaultAssay(int_cells) = 'RNA'
int_cells = PercentageFeatureSet(int_cells, '^RP[SL]', col.name = 'percent.rb')
int_cells = PercentageFeatureSet(int_cells, '^HB[^(P)]', col.name = 'percent.hb')
sel_cells = WhichCells(int_cells, expression = nFeature_RNA > 200 & nFeature_RNA < 2500 &
percent.rb > 5 & percent.hb < 2.5)
sel_genes = rownames(int_cells)[rowSums(int_cells) > 5]
int_cells = subset(int_cells, cells = sel_cells, features = sel_genes)
VlnPlot(int_cells, group.by = 'sample', pt.size = 0,
features = c('nFeature_RNA', 'nCount_RNA', 'percent.rb', 'percent.hb'), ncol = 2)
int_cells
## An object of class Seurat
## 17759 features across 24362 samples within 2 assays
## Active assay: RNA (15759 features, 0 variable features)
## 1 other assay present: integrated
DefaultAssay(int_cells) = 'integrated'
int_cells = ScaleData(int_cells, verbose = F)
int_cells = RunPCA(int_cells, npcs = 30, verbose = F)
int_cells = RunUMAP(int_cells, reduction = 'pca', dims = 1:30, verbose = F)
int_cells = FindNeighbors(int_cells, reduction = 'pca', dims = 1:30, verbose = F)
int_cells = FindClusters(int_cells, resolution = 0.5, verbose = F)
UMAPPlot(int_cells, group.by = 'orig.ident') + ggplot2::ggtitle(label = '')
UMAPPlot(int_cells, label = T) + NoLegend()
UMAPPlot(int_cells, label = T, split.by = 'orig.ident') + NoLegend()
Integration has yielded a harmonized set of cells, in which embedding
and clustering is not specific to treatment group.
There are 14 clusters in the whole dataset.
DefaultAssay(int_cells) = 'RNA'
# Get top 5 markers for all clusters
get_cons_mrkrs = function(cluster) {
df = FindConservedMarkers(int_cells, ident.1 = cluster,
grouping.var = 'orig.ident',
only.pos = T, verbose = F,
logfc.threshold = 0.25, min.pct = 0.25, min.diff.pct = 0.25)
# Helps to speed up and limit to most significant genes
df$avg_lfc = (df$Stim_avg_log2FC + df$Stim_avg_log2FC) / 2
df = df[order(df$minimump_p_val), ]
df = df[order(df$avg_lfc, decreasing = T), ]
print(paste0('Cluster-', cluster))
print(rownames(df)[1:5])
return(rownames(df)[1:5])
}
clu_mrkrs = c()
max_clu = max(as.numeric(int_cells$seurat_clusters)) - 1
for (i in 0:max_clu) {
clu_mrkrs = c(clu_mrkrs, get_cons_mrkrs(i))
}
## [1] "Cluster-0"
## [1] "SELL" "LTB" "CCR7" "GIMAP7" "LDHB"
## [1] "Cluster-1"
## [1] "CCL2" "C15orf48" "TIMP1" "SOD2" "LYZ"
## [1] "Cluster-2"
## [1] "TRAT1" "ZFP36L2" "CREM" NA NA
## [1] "Cluster-3"
## [1] "CCL5" "GZMH" "CD8A" "CD8B" "CST7"
## [1] "Cluster-4"
## [1] "CD79A" "CD74" "MS4A1" "HLA-DRA" "CD83"
## [1] "Cluster-5"
## [1] "GNLY" "GZMB" "PRF1" "NKG7" "CLIC3"
## [1] "Cluster-6"
## [1] "CACYBP" "GADD45B" "SRSF2" "HSPH1" "CD69"
## [1] "Cluster-7"
## [1] "VMO1" "FCGR3A" "MS4A7" "MS4A4A" "CXCL16"
## [1] "Cluster-8"
## [1] "MIR155HG" "HLA-DQA1" "MYC" "CD83" "ID3"
## [1] "Cluster-9"
## [1] "TXN" "HLA-DPB1" "HLA-DRA" "MARCKSL1" "LYZ"
## [1] "Cluster-10"
## [1] "PPBP" "PF4" "GNG11" "SDPR" "NRGN"
## [1] "Cluster-11"
## [1] "TXN" "TSPAN13" "ITM2C" "SEC61B" "IGJ"
# Select the top 1 marker for Cluster 0 and 1 respectively for visualization across the embedding
FeaturePlot(int_cells, features = clu_mrkrs[1], min.cutoff = 'q9')
FeaturePlot(int_cells, features = clu_mrkrs[6], min.cutoff = 'q9')
int_cells$id_bkp = Idents(int_cells)
Idents(int_cells) = 'orig.ident'
stim_mrkrs = FindMarkers(int_cells, ident.1 = 'Stim', ident.2 = 'Ctrl', only.pos = TRUE,
logfc.threshold = 0.5, min.pct = 0.5, min.diff.pct = 0.5, verbose = FALSE)
# Helps to speed up and limit to most significant genes
head(stim_mrkrs)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## ISG15 0 5.050741 0.984 0.215 0
## IFI6 0 3.816508 0.919 0.120 0
## IFI44L 0 2.589310 0.544 0.034 0
## RSAD2 0 3.425366 0.560 0.019 0
## TNFSF10 0 3.383802 0.623 0.069 0
## LY6E 0 3.175034 0.856 0.143 0
# Visualize top gene expression profile across conditions
VlnPlot(int_cells, features = rownames(stim_mrkrs)[1], split.by = 'orig.ident', group.by = 'seurat_clusters',
split.plot = T, pt.size = 0)
# Using the previously combined cells with distinct Ctrl and Stim groups to see gene expression
FeaturePlot(com_cells, features = rownames(stim_mrkrs)[1], shape.by = 'orig.ident', pt.size = 1, min.cutoff = 'q9')
Cell type identification can be performed by annotating the clusters
with prior knowledge of the above identified gene markers.
This annotation typically requires expert knowledge and/or literature
review.
Lately, there are tools available to query a RNA-seq data against a
reference RNA-seq dataset to find cell labels.
# Load cell type data from ENCODE
ref_data = BlueprintEncodeData()
sce = as.SingleCellExperiment(int_cells, assay = 'RNA')
# Use the ENCODE primary cell labels to predict single-cell labels
pred_types = SingleR(test = sce, ref = ref_data, labels = ref_data$label.main)
table(pred_types$labels)
##
## B-cells CD4+ T-cells CD8+ T-cells DC HSC Macrophages
## 2342 7573 5523 423 235 259
## Monocytes NK cells
## 5736 2271
# Visualize predicted labels against clusters
tab = table(Assigned = pred_types$labels, Cluster = sce@colData$seurat_clusters)
pheatmap::pheatmap(log2(tab + 10), color = colorRampPalette(c('white', 'blue'))(101))
int_cells@meta.data$pred_labels = pred_types$labels
UMAPPlot(int_cells, group.by = 'pred_labels', label = T) + ggplot2::ggtitle(label = '')
Most of the predicted cell labels match the cell types identified by the authors. Although they do not precisely match all the previously identified clusters directly. This can be improved by iteratively labeling clusters either by expert knowledge or through tools + reference datasets.
system('rm GSM2560248_2.1.mtx.gz GSM2560248_barcodes.tsv.gz GSM2560249_2.2.mtx.gz GSM2560249_barcodes.tsv.gz')
system('rm GSE96583_batch2.genes.tsv.gz ye1.ctrl.8.10.sm.best ye2.stim.8.10.sm.best')